### Homework 2
library(devtools)
## Loading required package: usethis
library(ggbiplot)
## Loading required package: ggplot2
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid
### Task 1

# 1.a
# Working directory for the homework data; adjust to your local path.
setwd("~/Downloads/UniStuff/IE 582/HW2")
muskdata <- read.csv(file="MUSK1.csv",header=TRUE)

# PCA on the 166 feature columns (columns 3:168; columns 1-2 hold the class
# label and the bag id). NOTE: princomp() has no 'center'/'scale' arguments
# (those belong to prcomp()); passing them only triggered a "disregarded"
# warning. princomp() always centers and works on the covariance matrix, so
# the results below are unchanged. Use cor = TRUE if a correlation-matrix
# PCA is wanted instead.
pca <- princomp(muskdata[3:168])
str(pca)
## List of 7
##  $ sdev    : Named num [1:166] 673 379 279 248 221 ...
##   ..- attr(*, "names")= chr [1:166] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
##  $ loadings: 'loadings' num [1:166, 1:166] 0.006074 0.059197 0.066313 -0.07226 -0.000118 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:166] "X42" "X.198" "X.109" "X.75" ...
##   .. ..$ : chr [1:166] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
##  $ center  : Named num [1:166] 38.7 -120 -79.2 16.1 -112.3 ...
##   ..- attr(*, "names")= chr [1:166] "X42" "X.198" "X.109" "X.75" ...
##  $ scale   : Named num [1:166] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "names")= chr [1:166] "X42" "X.198" "X.109" "X.75" ...
##  $ n.obs   : int 475
##  $ scores  : num [1:475, 1:166] 133.1 52.3 201.7 146.5 163 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:166] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
##  $ call    : language princomp(x = muskdata[3:168])
##  - attr(*, "class")= chr "princomp"
# Scatter of the first two principal-component scores, coloured by the
# class label in column X1 (+1 so labels 0/1 map to valid colour indices).
plot(pca$scores[,1],pca$scores[,2],col=muskdata$X1+1,pch=".", cex=7, xlab="First Component", ylab="Second Component", main="PCA")

# If we plot the scores of the first and second components (with bag labels color coded), we observe three main clusters. Two of them mix musk and non-musk labels, meaning the first two components - even though they explain 54% of the variance - fail to explain the label distribution for those clusters. The third main cluster consists solely of confirmed musk molecules. The rest of the data, made up of smaller clusters and some scattered points, appears to be separated correctly with respect to its labels. (No color mix)

# Biplot of the first two components (observations plus variable loadings).
ggbiplot(pca)

par(mfrow=c(1,1))
# Scree plot of the leading 25 component variances.
screeplot(pca, type = "l", npcs = 25, main = "Screeplot of the first 25 PCs")
# Reference line at eigenvalue 1 (Kaiser criterion). Note this PCA was run
# on the covariance matrix, where the leading variances are in the
# hundreds of thousands, so the criterion is mainly meaningful for
# correlation-matrix (standardized) PCA.
abline(h = 1, col="red", lty=5)

# In this screeplot, the red line marks an eigenvalue of 1: components falling below it have eigenvalues less than 1, which happens only after the 90% cumulative-variance threshold. That split occurs between components 16 and 21.

par(mfrow = c(1, 2))
# Eigenvalues are the squared component standard deviations; dividing by
# their total gives each component's share of the explained variance.
eigenvalues <- pca$sdev^2
eigenvalues[1:10]
##    Comp.1    Comp.2    Comp.3    Comp.4    Comp.5    Comp.6    Comp.7 
## 453210.10 143585.73  77718.16  61456.91  49013.16  46715.06  30658.10 
##    Comp.8    Comp.9   Comp.10 
##  23520.93  18169.62  17192.83
var.share <- eigenvalues / sum(eigenvalues)
var.share[1:20]
##      Comp.1      Comp.2      Comp.3      Comp.4      Comp.5      Comp.6 
## 0.407454310 0.129089415 0.069871788 0.055252262 0.044064828 0.041998743 
##      Comp.7      Comp.8      Comp.9     Comp.10     Comp.11     Comp.12 
## 0.027562882 0.021146269 0.016335226 0.015457054 0.012735485 0.011581332 
##     Comp.13     Comp.14     Comp.15     Comp.16     Comp.17     Comp.18 
## 0.011051607 0.008871376 0.008190991 0.007695100 0.006792809 0.006371665 
##     Comp.19     Comp.20 
## 0.006102287 0.005670939
# Cumulative share of the first 18 components (reaches ~90%).
sum(var.share[1:18])
## [1] 0.9015231
plot(var.share)
plot(cumsum(var.share))

# Here we observe that reducing our data to the first two components would explain 54% of the variance, but to account for more variance - let's say 90% - we need to use at least 18 components.

par(mfrow = c(1, 1))
# Cumulative proportion of variance explained by the leading components.
cum.share <- cumsum(pca$sdev^2 / sum(pca$sdev^2))
plot(cum.share[1:25], xlab = "PC #", ylab = "Amount of explained variance",
     main = "Cumulative variance plot")
# Guide lines: 18 components / 90% explained variance.
abline(v = 18, col = "blue", lty = 5)
abline(h = 0.9, col = "blue", lty = 5)

# Another look at our PCA components


#### Metric MDS ####

# Classical (metric) MDS on the Euclidean distances between molecules,
# reduced to k = 2 output coordinates.
mds <- cmdscale(dist(muskdata[3:168]), eig = TRUE, k = 2)
coord1 <- mds$points[, 1]
coord2 <- mds$points[, 2]
# Empty canvas first, then row names drawn coloured by class label.
plot(coord1, coord2, xlab = "Coordinate 1", ylab = "Coordinate 2",
     main = "Metric MDS", type = "n")
text(coord1, coord2, labels = row.names(muskdata), cex = .7,
     col = muskdata$X1 + 1)

# We obtain the same plot we had when we plotted our data with respect to first two components in PCA part.

## 1.b

# Reload the raw data so part 1.b can run standalone.
setwd("~/Downloads/UniStuff/IE 582/HW2")
muskdata <- read.csv(file="MUSK1.csv",header=TRUE)

# Column means of every row belonging to each bag id, one row per bag.
# Replaces the previous rbind-in-a-loop (quadratic copying) and the
# hard-coded 92 bags with whatever bag ids are actually present.
#
# df:  data frame of numeric columns (all columns are averaged, including
#      the label/bag-id columns, matching the original behaviour).
# bag: vector of bag ids, one per row of df.
# Returns a matrix with one row per bag id (ascending) and df's columns.
bag_means <- function(df, bag) {
  ids <- sort(unique(bag))
  t(vapply(ids,
           function(b) colMeans(df[bag == b, , drop = FALSE]),
           numeric(ncol(df))))
}

means <- bag_means(muskdata, muskdata$X1.1)
meanframe = data.frame(means)
avg_pca <- prcomp(means)

par(mfrow=c(1,2))
# Variance summary of the PCA on the bag-averaged data (output below).
summary(avg_pca)
## Importance of components:
##                            PC1      PC2      PC3      PC4       PC5
## Standard deviation     476.423 388.1926 315.6617 267.6392 208.86387
## Proportion of Variance   0.323   0.2144   0.1418   0.1019   0.06208
## Cumulative Proportion    0.323   0.5374   0.6792   0.7812   0.84323
##                              PC6       PC7       PC8     PC9     PC10
## Standard deviation     141.53526 132.95915 102.15680 97.4167 79.99774
## Proportion of Variance   0.02851   0.02516   0.01485  0.0135  0.00911
## Cumulative Proportion    0.87174   0.89689   0.91174  0.9253  0.93435
##                            PC11     PC12     PC13     PC14     PC15
## Standard deviation     78.88618 60.64198 56.35427 53.55273 51.85703
## Proportion of Variance  0.00886  0.00523  0.00452  0.00408  0.00383
## Cumulative Proportion   0.94321  0.94844  0.95296  0.95704  0.96087
##                           PC16     PC17     PC18     PC19    PC20     PC21
## Standard deviation     50.9702 46.06380 43.86342 41.64074 39.3357 37.81769
## Proportion of Variance  0.0037  0.00302  0.00274  0.00247  0.0022  0.00204
## Cumulative Proportion   0.9646  0.96758  0.97032  0.97279  0.9750  0.97703
##                            PC22     PC23     PC24    PC25     PC26
## Standard deviation     36.43081 35.30017 32.76942 31.3236 30.31878
## Proportion of Variance  0.00189  0.00177  0.00153  0.0014  0.00131
## Cumulative Proportion   0.97892  0.98069  0.98222  0.9836  0.98492
##                            PC27     PC28     PC29     PC30     PC31
## Standard deviation     28.83568 28.54947 27.69744 26.08912 24.06399
## Proportion of Variance  0.00118  0.00116  0.00109  0.00097  0.00082
## Cumulative Proportion   0.98610  0.98726  0.98836  0.98932  0.99015
##                            PC32     PC33     PC34     PC35     PC36
## Standard deviation     24.00827 22.65363 21.15223 20.99901 19.79506
## Proportion of Variance  0.00082  0.00073  0.00064  0.00063  0.00056
## Cumulative Proportion   0.99097  0.99170  0.99234  0.99296  0.99352
##                            PC37     PC38     PC39     PC40     PC41
## Standard deviation     19.33216 17.99780 17.64630 16.54225 16.32596
## Proportion of Variance  0.00053  0.00046  0.00044  0.00039  0.00038
## Cumulative Proportion   0.99405  0.99451  0.99496  0.99535  0.99573
##                            PC42     PC43     PC44     PC45     PC46
## Standard deviation     15.89539 15.77299 14.17232 13.52522 13.37045
## Proportion of Variance  0.00036  0.00035  0.00029  0.00026  0.00025
## Cumulative Proportion   0.99608  0.99644  0.99672  0.99699  0.99724
##                            PC47     PC48    PC49     PC50     PC51
## Standard deviation     12.89266 12.38336 11.8089 11.46886 11.27417
## Proportion of Variance  0.00024  0.00022  0.0002  0.00019  0.00018
## Cumulative Proportion   0.99748  0.99769  0.9979  0.99808  0.99826
##                            PC52     PC53    PC54    PC55    PC56   PC57
## Standard deviation     10.76037 10.31929 9.56433 9.46493 8.89280 8.4668
## Proportion of Variance  0.00016  0.00015 0.00013 0.00013 0.00011 0.0001
## Cumulative Proportion   0.99843  0.99858 0.99871 0.99883 0.99895 0.9990
##                           PC58    PC59    PC60    PC61    PC62    PC63
## Standard deviation     8.14739 7.61831 7.27496 7.09337 6.88045 6.37558
## Proportion of Variance 0.00009 0.00008 0.00008 0.00007 0.00007 0.00006
## Cumulative Proportion  0.99914 0.99923 0.99930 0.99937 0.99944 0.99950
##                           PC64    PC65    PC66    PC67    PC68    PC69
## Standard deviation     6.04981 5.66512 5.43414 5.32847 4.91153 4.70723
## Proportion of Variance 0.00005 0.00005 0.00004 0.00004 0.00003 0.00003
## Cumulative Proportion  0.99955 0.99960 0.99964 0.99968 0.99971 0.99974
##                           PC70    PC71    PC72    PC73    PC74    PC75
## Standard deviation     4.55197 4.36495 4.00033 3.84315 3.77200 3.67564
## Proportion of Variance 0.00003 0.00003 0.00002 0.00002 0.00002 0.00002
## Cumulative Proportion  0.99977 0.99980 0.99982 0.99984 0.99987 0.99988
##                           PC76    PC77    PC78    PC79    PC80    PC81
## Standard deviation     3.34858 3.15935 2.97065 2.76911 2.56367 2.50420
## Proportion of Variance 0.00002 0.00001 0.00001 0.00001 0.00001 0.00001
## Cumulative Proportion  0.99990 0.99991 0.99993 0.99994 0.99995 0.99996
##                           PC82    PC83    PC84    PC85  PC86 PC87  PC88
## Standard deviation     2.16672 2.15720 2.07444 2.02640 1.797 1.64 1.553
## Proportion of Variance 0.00001 0.00001 0.00001 0.00001 0.000 0.00 0.000
## Cumulative Proportion  0.99996 0.99997 0.99998 0.99998 1.000 1.00 1.000
##                         PC89  PC90  PC91      PC92
## Standard deviation     1.479 1.209 1.029 4.645e-14
## Proportion of Variance 0.000 0.000 0.000 0.000e+00
## Cumulative Proportion  1.000 1.000 1.000 1.000e+00
# Biplot of the bag-level PCA (first two components).
ggbiplot(avg_pca)

# Metric MDS on the bag-averaged feature columns, k = 2 output dimensions.
avg_mds <- cmdscale(dist(meanframe[3:168]), eig = TRUE, k = 2)
x <- avg_mds$points[, 1]
y <- avg_mds$points[, 2]
plot(x, y, xlab = "Coordinate 1", ylab = "Coordinate 2",
     main = "Metric MDS, Averaged frame", type = "p",
     col = meanframe$X1 + 1)
# BUG FIX: labels must come from the 92-row averaged frame, not the 475-row
# raw data. (The plot happened to look the same before because the first 92
# row names of muskdata coincide with meanframe's row names.)
text(x, y, labels = row.names(meanframe), cex = .7, col = meanframe$X1 + 1)
par(mfrow = c(1, 1))

# Here, we observe a different distribution. Three main clusters are still present, with two of them containing bags of mixed labels, just as in the previous part. The other main cluster, which previously consisted solely of positively labelled data, has now entered a region of heterogeneity.
# The cause may be the loss of information resulting from aggregating individual instances into a bag-level representation.
# Thus we can conclude that taking the average over instances might not be the best idea, as it causes information loss.

### Task 2

# 2.1

library("imager")
## Loading required package: magrittr
## 
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
## 
##     add
## The following object is masked from 'package:grid':
## 
##     depth
## The following object is masked from 'package:plyr':
## 
##     liply
## The following objects are masked from 'package:stats':
## 
##     convolve, spectrum
## The following object is masked from 'package:graphics':
## 
##     frame
## The following object is masked from 'package:base':
## 
##     save.image
# Load the picture as an imager object; later code subsets it with four
# indices (x, y, depth, colour channel), so it is treated as a 4-D array.
img <- load.image("newyear.jpg")
plot(img)

# 2.2

# Per-channel minimum and maximum pixel values, used to bound the noise.
# NOTE(review): imager images are 4-D (x, y, depth, colour); the 3-index
# subsetting below differs from the 4-index form used later (gray.img[..,,])
# and relies on the singleton depth dimension being tolerated — confirm this
# runs without an "incorrect number of dimensions" error.
minred = min(img[1:256,1:256,1])
mingreen = min(img[1:256,1:256,2])
minblue = min(img[1:256,1:256,3])
maxred = max(img[1:256,1:256,1])
maxgreen = max(img[1:256,1:256,2])
maxblue = max(img[1:256,1:256,3])

# Uniform noise per channel on [channel min, 0.1 * channel max].
# NOTE(review): if 0.1 * max < min for some channel, runif()'s min exceeds
# its max and it returns NaN with a warning — verify for this image.
noise1<-matrix(runif(256*256,minred,maxred*0.1),nrow=256)
noise2<-matrix(runif(256*256,mingreen,maxgreen*0.1),nrow=256)
noise3<-matrix(runif(256*256,minblue,maxblue*0.1),nrow=256)

par(mfrow=c(1,3))

# Add the noise into each colour channel in place.
img[,,1]=img[,,1]+noise1
img[,,2]=img[,,2]+noise2
img[,,3]=img[,,3]+noise3

par(mfrow=c(1,1))
plot(img)

# The noisy image was created by taking the minimum and maximum pixel values of each channel into account.
# Each noise value in a channel is distributed uniformly between the minimum pixel value of that channel and 0.1 times its maximum pixel value.
# Three noise matrices, one per channel, were created as instructed and added to the original image.

# Display each colour channel as a matrix, first with image()'s default
# palette, then with a 200-step grey ramp.
par(mfrow=c(1,3))
image(img[,,1])
image(img[,,2])
image(img[,,3])

par(mfrow=c(1,3))
image(img[,,1], col = grey(seq(0, 1, length = 200)))
image(img[,,2], col = grey(seq(0, 1, length = 200)))
image(img[,,3], col = grey(seq(0, 1, length = 200)))

# The origin of a loaded image is the top-left corner; however, image() draws a matrix with its origin at the bottom left, so the displays above are rotated. I will leave these as they are for show, but will rotate the remaining images via a newly defined rotate() function or by filling matrices with byrow=TRUE.
# Note that the images above display each channel separately, using only the channel values.
# If colour is also desired, one can set the values of the other channels (apart from the channel to be displayed) to zero and plot the resulting image.

# 2.3

# 2.3.a

# Convert to a single-channel greyscale image.
gray.img=grayscale(img)
par(mfrow=c(1,1))
plot(gray.img)

# Sliding-window patch extraction: every patchdim x patchdim window fully
# inside the 256 x 256 image becomes one row of 'patches' (patchdim^2
# pixels per row, (256 - patchdim + 1)^2 rows in total).
patchdim = 25 
patches <- matrix(, nrow = (256-(patchdim-1))*(256-(patchdim-1)), ncol = patchdim^2)

# k indexes the output row; i and j walk the window centres, staying
# (patchdim-1)/2 pixels away from the image border. Note patchdim must be
# odd for these half-width offsets to be whole numbers. j is the inner
# loop, which fixes the row ordering relied on by the matrix() reshapes
# in part 2.3.b.
k=1
for(i in (1+(patchdim-1)/2):(256-(patchdim-1)/2))
  for(j in (1+(patchdim-1)/2):(256-(patchdim-1)/2))
  {patches[k,]=gray.img[(i-(patchdim-1)/2):(i+(patchdim-1)/2),(j-(patchdim-1)/2):(j+(patchdim-1)/2),,]
  k=k+1}

# A for loop was constructed in order to extract patches from the image.
# A variable called patchdim was defined in order to try different patch sizes.

# PCA on the extracted patches. NOTE: princomp() has no 'center'/'scale'
# arguments (those belong to prcomp()); they were being disregarded with a
# warning. princomp() always centers and uses the covariance matrix, so the
# results below are identical without them.
pca <- princomp(patches)
par(mfrow=c(1,3))
plot(pca$scores[,1],pca$scores[,2])
# Share of variance explained per component and its cumulative sum.
comp.var <- (pca$sdev)^2
p.comp.var <- comp.var/(sum(comp.var))
p.comp.var[1:10]
##      Comp.1      Comp.2      Comp.3      Comp.4      Comp.5      Comp.6 
## 0.765726669 0.081080823 0.040531882 0.024071144 0.015034930 0.009101686 
##      Comp.7      Comp.8      Comp.9     Comp.10 
## 0.008774755 0.005233861 0.004498384 0.003984406
cumsum(p.comp.var)[1:10]
##    Comp.1    Comp.2    Comp.3    Comp.4    Comp.5    Comp.6    Comp.7 
## 0.7657267 0.8468075 0.8873394 0.9114105 0.9264454 0.9355471 0.9443219 
##    Comp.8    Comp.9   Comp.10 
## 0.9495557 0.9540541 0.9580385
plot(p.comp.var)
plot(cumsum(p.comp.var))

# We see that first component alone can explain more than 3/4 of the variance and first three components are enough to explain nearly 9/10 of the variance of the data.


# 2.3.b

# Rotate a matrix 90 degrees clockwise so image() shows it in the source
# orientation (image() draws row 1 at the bottom-left). Uses row subsetting
# instead of apply(x, 2, rev): apply() collapses a single-row matrix to a
# vector, which made the old version return a 1 x n instead of an n x 1
# rotation; drop = FALSE keeps the matrix shape in every case.
rotate <- function(x) t(x[rev(seq_len(nrow(x))), , drop = FALSE])
# Reshape the per-patch scores of the first three components back into
# (256 - patchdim + 1)-square maps and display them in source orientation.
par(mfrow = c(1, 3))
grey.ramp <- grey(seq(0, 1, length = 200))
map.side <- 256 - (patchdim - 1)
for (comp in 1:3) {
  score.map <- matrix(pca$scores[, comp], nrow = map.side)
  image(rotate(score.map), col = grey.ramp)
}

# The scores of the first component provide a good silhouette, as if we see the image from a myopic point of view.
# The second and third components, as expected, provide a less clear image compared to the precision of the first component.
# I also tried different patch sizes: at 5x5, the scores of the first component provide more than a silhouette. Smaller patch sizes would be expected to give a clearer display, as they converge to the original image.

# 2.3.c

# Display the first three eigenvectors (loadings), each reshaped into a
# patchdim x patchdim image; byrow = TRUE compensates for image()'s
# bottom-left origin.
par(mfrow = c(1, 3))
grey.ramp <- grey(seq(0, 1, length = 200))
for (pc in 1:3) {
  image(matrix(pca$loadings[, pc], nrow = patchdim, byrow = TRUE),
        col = grey.ramp)
}

# Eigenvectors are extracted from PCA via $loadings.
# In these plots, bolder regions refer to a higher value of an eigenvector than clear regions.
# Each component displays the part of the patch corresponding to their bolder regions more accurately than other parts.
# For example, first component mostly explains the information on the edges of an image, a patch.